19  G-g Difference (JAGS)

Purpose: Model effects of monthly/annual water availability on Big-little difference using JAGS

19.1 Data

19.1.1 Site info and daily data

Code
# site information and locations
siteinfo <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_SiteInformation.csv")
siteinfo_sp <- st_as_sf(siteinfo, coords = c("long", "lat"), crs = 4326)
mapview(siteinfo_sp, zcol = "designation")
Code
# flow/yield (and temp) data 
dat <- read_csv("C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/Data/EcoDrought_FlowTempData_DailyWeekly.csv") %>%
  filter(!site_name %in% c("WoundedBuckCreek", "Brackett Creek"))

# add water/climate year variables
dat <- add_date_variables(dat, dates = date, water_year_start = 4)

19.1.2 Separate and join G-g

Code
# pull out big G data
dat_big <- dat %>% filter(site_name %in% c("West Brook NWIS", "Paine Run 10", "Piney River 10", "Staunton River 10", "Spread Creek Dam", "BigCreekLower", "Shields River ab Smith NWIS", "Donner Blitzen River nr Frenchglen NWIS")) 

# pull out little G data, assign big G site name
dat_little <- dat %>% 
  filter(designation %in% c("little", "medium"), !subbasin %in% c("Coal Creek", "McGee Creek", "Duck Creek", "Flathead"),
         !site_name %in% c("West Brook NWIS", "West Brook 0", "Paine Run 10", "Piney River 10", "Staunton River 10", "Spread Creek Dam", "Shields River ab Smith NWIS", "BigCreekLower")) %>%
  mutate(site_name_big = ifelse(subbasin == "Big Creek", "BigCreekLower",
                                ifelse(subbasin == "West Brook", "West Brook NWIS", 
                                       ifelse(subbasin == "Paine Run", "Paine Run 10", 
                                              ifelse(subbasin == "Piney River", "Piney River 10",
                                                     ifelse(subbasin == "Staunton River", "Staunton River 10",
                                                            ifelse(subbasin == "Shields River", "Shields River ab Smith NWIS",
                                                                   ifelse(subbasin == "Snake River", "Spread Creek Dam", "Donner Blitzen River nr Frenchglen NWIS"))))))))

# Join big-little discharge data
dat_Gg <- dat_little %>%
  select(site_name, site_name_big, basin, subbasin, date, Month, MonthName, WaterYear, Yield_filled_mm_7) %>% rename(yield_little = Yield_filled_mm_7) %>%
  left_join(dat_big %>% select(site_name, date, Yield_filled_mm_7) %>% rename(site_name_big = site_name, yield_big = Yield_filled_mm_7)) %>%
  drop_na() %>% mutate(MonthName = as.character(MonthName))

# table
dat_Gg %>% group_by(subbasin) %>% summarise(site_name_big = unique(site_name_big)) %>% kable(caption = "Big G gage names for each focal sub-basin.")
Big G gage names for each focal sub-basin.
subbasin site_name_big
Big Creek BigCreekLower
Donner Blitzen Donner Blitzen River nr Frenchglen NWIS
Paine Run Paine Run 10
Shields River Shields River ab Smith NWIS
Snake River Spread Creek Dam
Staunton River Staunton River 10
West Brook West Brook NWIS

19.1.3 Calculate total yield

Code
# Get total annual and monthly Big G yield and join to KS diffs

# get total annual yield by big G site
totyield_annual <- dat_big %>% 
  group_by(site_name, WaterYear, basin, subbasin) %>% 
  summarise(days = n(), yield_annual = sum(Yield_filled_mm, na.rm = TRUE)) %>%
  filter(!is.na(yield_annual), days >= 360) %>%
  ungroup() %>%
  rename(site_name_big = site_name) %>%
  select(site_name_big, WaterYear, yield_annual) %>%
  ungroup()

# get total monthly yield by big G site
totyield_monthly <- dat_big %>% 
  group_by(site_name, WaterYear, basin, subbasin, Month) %>% 
  summarise(days = n(), yield_monthly = sum(Yield_filled_mm, na.rm = TRUE)) %>%
  filter(!is.na(yield_monthly), days >= 28) %>%
  ungroup() %>%
  rename(site_name_big = site_name) %>%
  select(site_name_big, WaterYear, Month, yield_monthly) %>%
  ungroup()

19.1.4 Final dataset

Join and filter G-g data to relevant basin(s) and years: currently, West Brook CY 2021 only!

Code
# wide format to enable direct difference calculation (Attempt 1)
dat_Gg2 <- dat_Gg %>% left_join(totyield_annual) %>% left_join(totyield_monthly) %>% 
  filter(basin == "West Brook", site_name != "Avery Broook NWIS", WaterYear == 2021) %>% 
  mutate(site_name_cd = as.numeric(as.factor(site_name)),
         z_log_yield_monthly = as.numeric(scale(log(yield_monthly), center = TRUE, scale = TRUE)),
         month_radian = as_radians((Month/12)*360))

# long format for more standard intercept model (Attempt 2)
dat_Gg3 <- dat_Gg2 %>% select(site_name, site_name_cd, Month, month_radian, WaterYear, yield_little, yield_big, z_log_yield_monthly) %>%
  gather(key = "ind", value = "yield", yield_little, yield_big) %>% mutate(indnum = as.numeric(as.factor(ind))-1)
head(dat_Gg3)
# A tibble: 6 × 9
  site_name  site_name_cd Month month_radian WaterYear z_log_yield_monthly ind  
  <chr>             <dbl> <dbl>        <dbl>     <dbl>               <dbl> <chr>
1 Avery Bro…            1     4         2.09      2021                1.18 yiel…
2 Avery Bro…            1     4         2.09      2021                1.18 yiel…
3 Avery Bro…            1     4         2.09      2021                1.18 yiel…
4 Avery Bro…            1     4         2.09      2021                1.18 yiel…
5 Avery Bro…            1     4         2.09      2021                1.18 yiel…
6 Avery Bro…            1     4         2.09      2021                1.18 yiel…
# ℹ 2 more variables: yield <dbl>, indnum <dbl>

Explore G-g data

Code
dat_Gg2 %>% group_by(WaterYear, Month) %>% summarise(z_log_yield_monthly = unique(z_log_yield_monthly)) %>% 
  ggplot(aes(x = Month, y = z_log_yield_monthly)) + geom_point() + geom_smooth()

Standardized (log) total monthly yield at Big G during CY 2021
Code
dat_Gg3 %>% ggplot() + geom_boxplot(aes(x = as.factor(Month), y = log(yield), fill = ind)) + facet_wrap(~site_name)

Distribution of (log) yield at G-g over time, by site
Code
dat_Gg2 %>% 
  group_by(site_name, Month) %>% 
  summarise(yield_little = mean(log(yield_little), na.rm = TRUE), 
            yield_big = mean(log(yield_big), na.rm = TRUE),
            z_log_yield_monthly = unique(z_log_yield_monthly)) %>%
  mutate(Qd = yield_little - yield_big) %>% 
  ggplot(aes(x = Month, y = Qd)) + geom_point() + geom_smooth() + ylab("mean[log(g) - log(G)]") + 
  facet_wrap(~site_name) + geom_abline(intercept = 0, slope = 0, linetype = "dashed")

G-g difference in (log) monthly mean yield over time
Code
dat_Gg2 %>% 
  group_by(site_name, Month) %>% 
  summarise(yield_little = mean(log(yield_little), na.rm = TRUE), 
            yield_big = mean(log(yield_big), na.rm = TRUE),
            z_log_yield_monthly = unique(z_log_yield_monthly)) %>%
  mutate(Qd = yield_little - yield_big) %>% 
  ggplot(aes(x = z_log_yield_monthly, y = Qd)) + geom_point() + geom_smooth(method = "lm") + 
  ylab("mean[log(g) - log(G)]") + facet_wrap(~site_name) + geom_abline(intercept = 0, slope = 0, linetype = "dashed")

G-g difference in (log) monthly mean yield as a function of (log) total monthly yield at G

19.2 Declare the JAGS model

First attempt tries to model the difference as a derived parameter (like the growth model), but this maintains the temporal structure of the data, and we are primarily interested in the difference in the distributions

Code
cat("model {

##--- LIKELIHOOD ---------------------------------------------------##

for (i in 1:nObs) {
  Qg[i] ~ dnorm(mu[i], pow(sigma, -2))
  mu[i] <- QG[i] - Qd[i]
  Qd[i] <- alpha[sites[i]] + beta[sites[i]] * yield[i]
  
  # Log-likelihood
  loglik[i] <- logdensity.norm(Qg[i], mu[i], pow(sigma, -2))
  }


##--- PRIORS --------------------------------------------------------##

# Process error is shared among sites
sigma ~ dunif(0.001, 100)

# Site-specific parameters
for (k in 1:nSites) {
  alpha[k] ~ dnorm(alpha.mu, pow(sigma.alpha, -2))
  beta[k] ~ dnorm(beta.mu, pow(sigma.beta, -2))
  }

# Global parameters
alpha.mu ~ dnorm(0, pow(10, -2))
beta.mu ~ dnorm(0, pow(10, -2))

# Site-level variation in alpha and beta
sigma.alpha ~ dunif(0.001, 100)
sigma.beta ~ dunif(0.001, 100)

}", file = "./Big G Little g/JAGS Models/GgMod.txt")

Second attempt is a more standard intercept model.

  • “alpha” is the mean monthly yield at Big-G, which is shared among sites
  • “beta1” is a Little-g offset to the intercept, which describes the site-level mean G-g difference
  • “beta2” describes the effect of water availability (log total monthly yield at Big G) on the site-level mean G-g difference.
    • Note that this is only “turned on” for little-g (ind[i] = 1)
  • Should add covariates on sigma to deal with unequal variance among groups
Code
cat("model {

##--- LIKELIHOOD ---------------------------------------------------##

for (i in 1:nObs) {
  Q[i] ~ dnorm(mu[i], pow(sigma, -2))
  mu[i] <- alpha[months[i]] + beta1[sites[i]] * ind[i] + beta2[sites[i]] * ind[i] * yieldtot[i]
  
  # Log-likelihood
  loglik[i] <- logdensity.norm(Q[i], mu[i], pow(sigma, -2))
  }


##--- PRIORS --------------------------------------------------------##

# Process error is shared among sites
sigma ~ dunif(0.001, 100)

# Site-specific parameters
for (j in 1:nSites) {
  beta1[j] ~ dnorm(0, pow(10, -2))
  beta2[j] ~ dnorm(0, pow(10, -2))
  }

for(k in 1:nMonths) {
  alpha[k] ~ dnorm(0, pow(10, -2))
  }

}", file = "C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/EcoDrought-Analysis/Big G Little g/JAGS Models/GgMod.txt")

19.3 Fit the JAGS model

Code
# gather data for JAGS
# jags.data <- list("nObs" = dim(dat_Gg2)[1], "nSites" = length(unique(dat_Gg2$site_name_cd)), "sites" = dat_Gg2$site_name_cd, 
#                   "Qg" = dat_Gg2$yield_little, "QG" = dat_Gg2$yield_big, "yield" = dat_Gg2$z_yield_monthly)
jags.data <- list("nObs" = dim(dat_Gg3)[1], "nSites" = length(unique(dat_Gg3$site_name_cd)), "nMonths" = length(unique(dat_Gg3$Month)), 
                  "sites" = dat_Gg3$site_name_cd, "months" = dat_Gg3$Month,
                  "Q" = log(dat_Gg3$yield+0.01), "ind" = dat_Gg3$indnum, "yieldtot" = dat_Gg3$z_log_yield_monthly)

# parameters to monitor
# jags.params <- c("alpha", "alpha.mu", "sigma.alpha", "beta", "beta.mu", "sigma.beta", "sigma", "loglik")
jags.params <- c("alpha", "beta1", "beta2", "sigma", "loglik")

# run in jags
mod_0 <- jags.parallel(data = jags.data, inits = NULL, parameters.to.save = jags.params,
                       model.file = "C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/EcoDrought-Analysis/Big G Little g/JAGS Models/GgMod.txt",
                       n.chains = 6, n.thin = 20, n.burnin = 1000, n.iter = 5000, DIC = TRUE)
# saveRDS(mod_0, "C:/Users/jbaldock/OneDrive - DOI/Documents/USGS/EcoDrought/EcoDrought Working/EcoDrought-Analysis/Big G Little g/JAGS Models/GgMod.RDS")

Any problematic R-hat values?

Code
mod_0$BUGSoutput$summary[,8][mod_0$BUGSoutput$summary[,8] > 1.01]
loglik[411] loglik[618] loglik[710] 
   1.010734    1.010492    1.010209 

19.3.1 View traceplots

Code
MCMCtrace(mod_0, ind = TRUE, params = c("alpha", "beta1", "beta2", "sigma"), pdf = FALSE)

19.3.2 Get MCMC samples and summary

Code
top_mod <- mod_0
# generate MCMC samples and store as an array
modelout <- top_mod$BUGSoutput
McmcList <- vector("list", length = dim(modelout$sims.array)[2])
for(i in 1:length(McmcList)) { McmcList[[i]] = as.mcmc(modelout$sims.array[,i,]) }
# rbind MCMC samples from 10 chains 
Mcmcdat <- rbind(McmcList[[1]], McmcList[[2]], McmcList[[3]], McmcList[[4]], McmcList[[5]], McmcList[[6]])
param.summary <- modelout$summary
head(param.summary)
                mean         sd       2.5%        25%         50%         75%
alpha[1]  0.59568531 0.02638917  0.5436847  0.5777065  0.59598800  0.61348801
alpha[2] -0.09285564 0.02604134 -0.1432750 -0.1101489 -0.09325746 -0.07525745
alpha[3]  0.49066445 0.02564605  0.4400611  0.4732223  0.49017450  0.50739094
alpha[4]  1.27031184 0.02555007  1.2214453  1.2523105  1.27030194  1.28782775
alpha[5]  0.55785916 0.02648229  0.5064006  0.5406703  0.55804078  0.57554557
alpha[6] -1.55636636 0.02595081 -1.6059092 -1.5743114 -1.55681564 -1.53864122
               97.5%      Rhat n.eff
alpha[1]  0.64672138 0.9993902  1200
alpha[2] -0.04258177 1.0030019   780
alpha[3]  0.53995110 1.0029285   790
alpha[4]  1.32114123 1.0022643   930
alpha[5]  0.61039691 1.0043291   650
alpha[6] -1.50654107 1.0000613  1200

19.4 Plot model output

Code
# control panel
nvals <- 100
nsim <- 50
nsites <- length(unique(dat_Gg3$site_name_cd))
x_seq <- seq(from = min(dat_Gg2$z_log_yield_monthly), to = max(dat_Gg2$z_log_yield_monthly), length.out = nvals)

# predict from model
pred_arr <- array(NA, dim = c(nsim, nvals, nsites))
for (k in 1:nsites) {
  for (j in 1:nsim) {
    pred_arr[j,,k] <- Mcmcdat[j,paste("beta1[", k, "]", sep = "")] + Mcmcdat[j,paste("beta2[", k, "]", sep = "")] * x_seq
  }
}
# pred_dist_lin <- matrix(NA, nrow = nrow(Mcmcdat), ncol = nrgs)
# for (i in 1:nrow(pred_dist_lin)) { pred_dist_lin[i,] <- exp(Mcmcdat[i,"alpha"] + Mcmcdat[i,"beta1"]*pdist) }
# pred_dist_tran <- matrix(NA, nrow = nrow(Mcmcdat), ncol = nrgs)
# for (i in 1:nrow(pred_dist_tran)) { pred_dist_tran[i,] <-  pred_dist_lin[i,] / rowSums(pred_dist_lin)[i] }
# pred_lower <- apply(pred_dist_tran, MARGIN = 2, quantile, prob = 0.025)
# pred_upper <- apply(pred_dist_tran, MARGIN = 2, quantile, prob = 0.975)
# pred_median <- apply(pred_dist_tran, MARGIN = 2, quantile, prob = 0.5)
Code
par(mar = c(5,5,2,12))
mycols <- brewer.pal(9, "Set1")
plot(seq(from = range(pred_arr)[1], to = range(pred_arr)[2], length.out = nvals) ~ x_seq, type = "n", xlab = "(log) Monthly total yield at Big G (z-score)", ylab = "Little g deviation from Big G")
for (k in 1:nsites) {
  for (j in 1:nsim) {
    lines(pred_arr[j,,k] ~ x_seq, col = alpha(mycols[k], 0.3))
  }
}
abline(h = 0, lty = 2)
par(xpd = TRUE)
legend("right", inset = c(-0.4,0), legend = unlist(dat_Gg3 %>% group_by(site_name) %>% summarise(stcd = unique(site_name_cd)) %>% select(site_name)), fill = mycols, bty = "n")

G-g mean difference in (log) yield as a function of (log) total monthly yield at G